library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(readr)
library(viridis)
## Loading required package: viridisLite
library(e1071)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
Note: there are only few individuals of the small ciliates because only cells bigger than 1000um2 are tracked at this magnification and most small ciliates (Tetrahymena, Dexiostoma and Loxocephalus) are smaller than that.
load("../data/16x/Coleps_irchel/Morph_mvt.RData")
# temp2 <- morph_mvt
# temp2$species <- "Coleps_irchel2"
dd16 <- morph_mvt
load("../data/16x/Coleps_viridis/Morph_mvt.RData")
# dd16 <- rbind(dd16, morph_mvt)
load("../data/16x/Colpidium/Morph_mvt.RData")
morph_mvt$species <- "Colpidium"
dd16 <- rbind(dd16, morph_mvt)
# load("../data/16x/Didinium/Morph_mvt.RData")
# dd16 <- rbind(dd16, morph_mvt)
load("../data/16x/Euplotes/Morph_mvt.RData")
# morph_mvt <- morph_mvt %>% filter(mean_area>3000)
dd16 <- rbind(dd16, morph_mvt)
load("../data/16x/Euplotes_second/Morph_mvt.RData")
# morph_mvt <- morph_mvt %>% filter(mean_area>3000)
dd16 <- rbind(dd16, morph_mvt)
load("../data/16x/Paramecium_bursaria/Morph_mvt.RData")
dd16 <- rbind(dd16, morph_mvt)
load("../data/16x/Paramecium_caudatum/Morph_mvt.RData")
dd16 <- rbind(dd16, morph_mvt)
load("../data/16x/Stylonychia_1/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia1"
# temp <- morph_mvt
# temp$species <- "Stylonychia1_1"
dd16 <- rbind(dd16, morph_mvt)
load("../data/16x/Stylonychia_1_second/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia1"
# temp2 <- morph_mvt
# temp2$species <- "Stylonychia1_2"
dd16 <- rbind(dd16, morph_mvt)
load("../data/16x/Stylonychia_2/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia2"
# temp3 <- morph_mvt
dd16 <- rbind(dd16, morph_mvt)
load("../data/16x/Dexiostoma/Morph_mvt.RData")
dd16 <- rbind(dd16, morph_mvt)
load("../data/16x/Loxocephallus/Morph_mvt.RData")
dd16 <- rbind(dd16, morph_mvt)
load("../data/16x/Tetrahymena/Morph_mvt.RData")
morph_mvt$species <- "Tetrahymena"
dd16 <- rbind(dd16, morph_mvt)
## filtering
dd16 <- dd16 %>%
filter(net_disp>50, net_speed>5)
dd16$magnification <- 1.6
table(dd16$species)
##
## Coleps_irchel Colpidium Dexiostoma Euplotes
## 1298 341 10 2028
## Loxocephallus Paramecium_bursaria Paramecium_caudatum Stylonychia1
## 9 2652 2231 650
## Stylonychia2 Tetrahymena
## 375 25
dd16$species <- ifelse(dd16$species %in% c("Tetrahymena","Dexiostoma","Loxocephallus"),"Smaller_ciliates",dd16$species)
dd16$id <- 1:nrow(dd16)
dd16 <- dd16 %>% na.omit()
dd16_long <- dd16 %>%
dplyr::select(species, mean_area,sd_area,mean_perimeter,sd_perimeter,mean_major,
sd_major,mean_ar,sd_ar,
mean_turning,sd_turning,gross_disp,max_net,net_disp,net_speed,max_step,
min_step,sd_step,sd_gross_speed,max_gross_speed,min_gross_speed, id) %>%
pivot_longer(cols=-c(id,species), names_to="variable") %>%
dplyr::group_by(variable,species) %>%
mutate(iqr = IQR(value),
first_quartile = quantile(value, probs = 0.25),
third_quartile = quantile(value, probs = 0.75),
outlier = ifelse(value<first_quartile-1.5*iqr | value>third_quartile+1.5*iqr,T,F))
outliers <- dd16_long %>%
dplyr::filter(outlier==T) %>%
dplyr::group_by(species, id) %>%
dplyr::summarise(n = n()) %>%
dplyr::filter(n>3)
## `summarise()` has grouped output by 'species'. You can override using the `.groups` argument.
table(outliers$species)
##
## Coleps_irchel Colpidium Euplotes Paramecium_bursaria
## 163 12 78 237
## Paramecium_caudatum Smaller_ciliates Stylonychia1 Stylonychia2
## 78 8 81 24
nrow(outliers)/nrow(dd16)
## [1] 0.07079738
dd16_filtered <- dd16 %>%
dplyr::filter(!is.element(id,outliers$id))
dd16$removed_outliers <- F
dd16_filtered$removed_outliers <- T
dd16_comparison <- rbind(dd16,dd16_filtered)
dd16_comparison %>%
ggplot(aes((mean_area), col=removed_outliers))+
geom_histogram(aes(fill=removed_outliers, y=..density..), position = "identity", alpha=0.3) +
# geom_boxplot(outlier.alpha = 0.3) +
theme_bw() +
facet_wrap(~species, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
dd16_filtered %>%
ggplot(aes(log10(mean_area)))+
geom_density(aes(col=species))
species <- unique(dd16$species)
compositions <- read_csv("../../../compositions.csv")
## Rows: 15 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): composition
## dbl (19): richness, Chlamydomonas, Cryptomonas, Monoraphidium, Cosmarium, St...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
comp_name <- compositions$composition
compositions$Smaller_ciliates <- with(compositions,
ifelse(Tetrahymena == 1| Dexiostoma == 1| Loxocephallus == 1, 1, 0))
compositions <- compositions[,species]
compositions_list <- apply(compositions, 1, function(x){
idx <- which(x==1)
names(idx)
})
names(compositions_list) <- comp_name
classifier_plot_svm <- function(table, combination.nr){
# cm <- classifier$confusion
cm <- table
ncol <- ncol(cm)
cm <- apply(cm, 1, function(x) x/sum(x))
cm_long <- data.frame(Predicted = factor(rep(rownames(cm),ncol), levels = rownames(cm)),
Truth = factor(rep(colnames(cm), each=ncol), levels = colnames(cm)),
Classification = as.vector(cm))
plot <- cm_long %>%
ggplot(aes(Predicted,Truth,fill=Classification))+
geom_tile() +
geom_text(aes(label = round(Classification, 3)), col="white") +
scale_x_discrete(position = "top") +
scale_y_discrete(limits = rev(levels(cm_long$Truth))) +
scale_fill_viridis(option = "D", end=.8, discrete=FALSE)+
theme(axis.text.x = element_text(angle = 90, hjust = 0))+
theme(legend.text = element_text(size=14),
axis.title = element_text(size=14),
title = element_text(size=16),
axis.text = element_text(size=13))+
labs(title=paste("PPV of",combination.nr),fill="PPV")
return(plot)
}
classifier_assessment_plot <- function(cf, combination.nr){
cf.df <- cf$byClass[,1:4] %>%
as.data.frame() %>%
mutate(Species = gsub("Class: ", "", rownames(cf$byClass[,1:5]))) %>%
rename("NPV" = "Neg Pred Value", "PPV" = "Pos Pred Value",
"Sens" = "Sensitivity", "Spec" = "Specificity") %>%
pivot_longer(cols = 1:4) %>%
mutate(col = ifelse(value>=0.9,"1",
ifelse(value<0.9 & value>=0.8,"2",
ifelse(value<0.8 & value>=0.7,"3","4"))))
plot <- cf.df %>%
ggplot(aes(name,Species,fill=col))+
geom_tile() +
geom_text(aes(label = round(value, 3)), col="white") +
scale_x_discrete(position = "top") +
scale_y_discrete(limits = rev(levels(as.factor(cf.df$Species)))) +
scale_fill_manual(values = c("#006837","#66bd63","#f46d43","#a50026"), breaks = c("1","2","3","4")) +
theme(legend.text = element_text(size=14),
title = element_text(size = 16),
axis.title = element_blank(),
axis.text = element_text(size=13),
legend.position = "none")+
labs(title=combination.nr, fill="")
return(plot)
}
formula <- factor(species) ~ mean_area + sd_area + mean_perimeter + sd_perimeter + mean_major +
sd_major + mean_ar + sd_ar +
mean_turning + sd_turning + gross_disp + max_net + net_disp + net_speed + max_step +
min_step + sd_step + sd_gross_speed + max_gross_speed + min_gross_speed
trainingdata <- dd16[complete.cases(dd16), ]
set.seed(624)
trainingdata$species <- factor(trainingdata$species)
split_up <- split(trainingdata, f = trainingdata$species)
subsamples <- lapply(split_up, function(x) {
x %>% sample_n(ifelse(nrow(x) < 500, nrow(x), 500))})
trainingdata <- do.call("rbind", subsamples)
# table(sub_trainingdata$species)
# A stratified random split of the data
idx_train <- createDataPartition(trainingdata$species,
p = 0.65, # percentage of data as training
list = FALSE)
testdata <- trainingdata[-idx_train,]
trainingdata <- trainingdata[idx_train,]
obj <- tune(svm, formula, data = trainingdata, kernel="radial",
ranges = list(gamma = 2^(-8:2), cost = 2^(1:10)), probability = TRUE,
tunecontrol = tune.control(sampling = "fix", best.model = T))
cf <- caret::confusionMatrix(predict(obj$best.model, newdata=testdata %>% select(-species)),
testdata$species)
classifier_plot_svm(table = cf$table,
combination.nr = "all")
classifier_assessment_plot(cf = cf, combination.nr = "all")
There are mainly 4 measures:
Sensitivity: the probability that an individuals is classified as species X given that the it is of species X: P(Classified as species X | is of species X)
Specificity: the probability that an individuals is not classified as species X given that it is not of species X: P(Not classified as species X | is not of species X)
Positive Predictive Value PPV: the probability that an individual is of species X given that it has been classified as species X: P(is of species X | is classified as species X)
Negative Predictive Value NPV: the probability that an individuals is not of species X given that it has not been classified as species X: P(is not of species X | has not been classified as species X)
PPV is the most important for us!
formula <- factor(species) ~ mean_area + sd_area + mean_perimeter + sd_perimeter + mean_major +
sd_major + mean_ar + sd_ar +
mean_turning + sd_turning + gross_disp + max_net + net_disp + net_speed + max_step +
min_step + sd_step + sd_gross_speed + max_gross_speed + min_gross_speed
set.seed(62)
classifiers_18c_16x <- list()
classifiers_18c_16x_plots <- list()
classifiers_18c_16x_assess_plots <- list()
trainingdata <- dd16_filtered[complete.cases(dd16_filtered), ]
for(i in 1:length(compositions_list)){
sub_trainingdata <- trainingdata %>%
filter(species %in% c(compositions_list[[i]]))
n.var <- length(unique(sub_trainingdata$species))
sub_trainingdata$species <- factor(sub_trainingdata$species)
split_up <- split(sub_trainingdata, f = sub_trainingdata$species)
subsamples <- lapply(split_up, function(x) {
x %>% sample_n(ifelse(nrow(x) < 500, nrow(x), 500))})
sub_trainingdata <- do.call("rbind", subsamples)
# table(sub_trainingdata$species)
# A stratified random split of the data
idx_train <- createDataPartition(sub_trainingdata$species,
p = 0.7, # percentage of data as training
list = FALSE)
sub_testdata <- sub_trainingdata[-idx_train,]
sub_trainingdata <- sub_trainingdata[idx_train,]
n.min <- min(table(sub_trainingdata$species))
obj <- tune(svm, formula, data = sub_trainingdata, kernel="radial",
ranges = list(gamma = 2^(-8:2), cost = 2^(1:10)), probability = TRUE,
tunecontrol = tune.control(sampling = "fix", best.model = T))
classifiers_18c_16x[[i]] <- obj$best.model
cf <- caret::confusionMatrix(predict(obj$best.model, newdata=sub_testdata %>% select(-species)),
sub_testdata$species)
classifiers_18c_16x_plots[[i]] <- classifier_plot_svm(table = cf$table,
combination.nr = names(compositions_list)[[i]])
classifiers_18c_16x_assess_plots[[i]] <- classifier_assessment_plot(cf, names(compositions_list)[[i]])
}
names(classifiers_18c_16x_plots) <- names(classifiers_18c_16x) <-
names(classifiers_18c_16x_assess_plots) <- comp_name
library(patchwork)
for(i in 1:length(classifiers_18c_16x_plots)){
print(classifiers_18c_16x_plots[[i]] + classifiers_18c_16x_assess_plots[[i]] +
plot_layout(widths = c(4,2)))
}
saveRDS(classifiers_18c_16x, "svm_video_classifiers_18c_16x_trained_december2021.rds")
# cl <- readRDS("classifiers_18c_16x.rds")